Environmental data (n=10) Medical data (n=7)
You can find metadata here: https://github.com/HeatherWelch/melanoma
With individual linear models fit to each pair
source('/Users/heatherwelch/Dropbox/melenoma/melanoma_GitHub/utilities/load_libraries.R')
modDF=read.csv("/Users/heatherwelch/Dropbox/melenoma/melanoma_GitHub/model_data/data_01_30_20.csv")
env=modDF %>% dplyr::select(SEER_rate,anRange_temperature,cancer_gov_UV_exposure,mean_cloud,elevation,mean_temperature,seasonality_cloud,seasonality_temperature,sun_exposure,UV_daily_dose,UV_irradiance)
master_env=env %>% gather(variable, value,-SEER_rate)
C3=ggplot(master_env,aes(x=SEER_rate,y=value),color="black")+geom_point(size=1)+stat_smooth(se=F,method = "lm",formula = y~x)+
theme(text = element_text(size=8),axis.text = element_text(size=8),plot.title = element_text(hjust=0,size=8),legend.position=c(.15,.3),legend.justification = c(.9,.9),legend.key.size = unit(.5,'lines'))+
theme(legend.background = element_blank(),legend.box.background = element_rect(colour = NA),legend.margin=unit(0.3, "lines"))+theme_classic()+
ggtitle("Environment")+
ylab("Predictor")+xlab("Melanoma rate")+facet_wrap(~variable,scales="free")
C3
jur=modDF %>% dplyr::select(-c(anRange_temperature,cancer_gov_UV_exposure,mean_cloud,elevation,mean_temperature,seasonality_cloud,seasonality_temperature,sun_exposure,UV_daily_dose,UV_irradiance))
master_jur=jur %>% gather(variable, value,-SEER_rate)
C3=ggplot(master_jur,aes(x=SEER_rate,y=value),color="black")+geom_point(size=1)+stat_smooth(se=F,method = "lm",formula = y~x)+
theme(text = element_text(size=8),axis.text = element_text(size=8),plot.title = element_text(hjust=0,size=8),legend.position=c(.15,.3),legend.justification = c(.9,.9),legend.key.size = unit(.5,'lines'))+
theme(legend.background = element_blank(),legend.box.background = element_rect(colour = NA),legend.margin=unit(0.3, "lines"))+theme_classic()+
ggtitle("Medical")+
ylab("Predictor")+xlab("Melanoma rate")+facet_wrap(~variable,scales="free")
C3
M <- cor(modDF)
corrplot(M,type="upper",order="hclust",outline = T,tl.col="black",tl.cex = .9)
How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)
library(mgcv)
a=glm(SEER_rate~anRange_temperature+cancer_gov_UV_exposure+mean_cloud+elevation+mean_temperature+seasonality_cloud+seasonality_temperature+sun_exposure+UV_daily_dose+UV_irradiance,data=modDF)
summary(a)
##
## Call:
## glm(formula = SEER_rate ~ anRange_temperature + cancer_gov_UV_exposure +
## mean_cloud + elevation + mean_temperature + seasonality_cloud +
## seasonality_temperature + sun_exposure + UV_daily_dose +
## UV_irradiance, data = modDF)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -56.936 -10.095 -1.911 9.052 82.217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 399.181082 89.142525 4.478 8.77e-06 ***
## anRange_temperature -0.136452 0.088615 -1.540 0.124045
## cancer_gov_UV_exposure 0.014442 0.009583 1.507 0.132245
## mean_cloud -0.064671 0.240035 -0.269 0.787683
## elevation -0.002797 0.004298 -0.651 0.515346
## mean_temperature -0.096078 0.076208 -1.261 0.207817
## seasonality_cloud -0.823232 0.339750 -2.423 0.015638 *
## seasonality_temperature -0.001326 0.002895 -0.458 0.646982
## sun_exposure -0.014480 0.012170 -1.190 0.234487
## UV_daily_dose 0.217959 0.065034 3.351 0.000846 ***
## UV_irradiance -4.192563 1.268161 -3.306 0.000994 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 299.1332)
##
## Null deviance: 240859 on 726 degrees of freedom
## Residual deviance: 214179 on 716 degrees of freedom
## AIC: 6220.6
##
## Number of Fisher Scoring iterations: 2
env_glm_aic=a$aic
c=summary(a)
env_glm_r2=1-(c$deviance/c$null.deviance)
This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)
library(effects)
for(i in 1:10){
plot(predictorEffects(a)[i],cex=.1)
}
You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)
library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
## Overall Var
## 1 0.2694217 mean_cloud
## 2 0.4581517 seasonality_temperature
## 3 0.6508603 elevation
## 4 1.1898816 sun_exposure
## 5 1.2607303 mean_temperature
## 6 1.5070265 cancer_gov_UV_exposure
## 7 1.5398231 anRange_temperature
## 8 2.4230526 seasonality_cloud
## 9 3.3060182 UV_irradiance
## 10 3.3514550 UV_daily_dose
b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b)
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))
ratio_env=sum(x$SEER_rate)/sum(x$predicted)
RMSE_env=RMSE(x$predicted,x$SEER_rate)
How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)
library(mgcv)
a=glm(SEER_rate~incm_pc+wpovr50+incm_mh+derm_pk+pcp_pk+docs_pk+wpvr100,data=modDF)
summary(a)
##
## Call:
## glm(formula = SEER_rate ~ incm_pc + wpovr50 + incm_mh + derm_pk +
## pcp_pk + docs_pk + wpvr100, data = modDF)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -45.111 -10.902 -1.226 7.840 77.733
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.8331145 4.9162193 4.034 6.06e-05 ***
## incm_pc -0.0002332 0.0000871 -2.678 0.007578 **
## wpovr50 0.3349817 0.1678134 1.996 0.046294 *
## incm_mh 0.0003333 0.0001139 2.925 0.003551 **
## derm_pk 1.5139199 0.4574544 3.309 0.000981 ***
## pcp_pk 0.1067555 0.0379640 2.812 0.005057 **
## docs_pk -0.0286660 0.0128139 -2.237 0.025586 *
## wpvr100 0.0281144 0.1862127 0.151 0.880034
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 262.8892)
##
## Null deviance: 240859 on 726 degrees of freedom
## Residual deviance: 189017 on 719 degrees of freedom
## AIC: 6123.7
##
## Number of Fisher Scoring iterations: 2
med_glm_aic=a$aic
c=summary(a)
med_glm_r2=1-(c$deviance/c$null.deviance)
This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)
library(effects)
for(i in 1:7){
plot(predictorEffects(a)[i],cex=.1)
}
You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)
library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
## Overall Var
## 1 0.1509802 wpvr100
## 2 1.9961555 wpovr50
## 3 2.2370981 docs_pk
## 4 2.6778729 incm_pc
## 5 2.8120166 pcp_pk
## 6 2.9251561 incm_mh
## 7 3.3094446 derm_pk
b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b)
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))
ratio_med=sum(x$SEER_rate)/sum(x$predicted)
RMSE_med=RMSE(x$predicted,x$SEER_rate)
How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)
library(mgcv)
a=glm(SEER_rate~incm_pc+wpovr50+incm_mh+derm_pk+pcp_pk+docs_pk+wpvr100+anRange_temperature+cancer_gov_UV_exposure+mean_cloud+elevation+mean_temperature+seasonality_cloud+seasonality_temperature+sun_exposure+UV_daily_dose+UV_irradiance,data=modDF)
summary(a)
##
## Call:
## glm(formula = SEER_rate ~ incm_pc + wpovr50 + incm_mh + derm_pk +
## pcp_pk + docs_pk + wpvr100 + anRange_temperature + cancer_gov_UV_exposure +
## mean_cloud + elevation + mean_temperature + seasonality_cloud +
## seasonality_temperature + sun_exposure + UV_daily_dose +
## UV_irradiance, data = modDF)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -58.679 -9.766 -0.545 8.086 78.393
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.804e+02 8.156e+01 4.664 3.71e-06 ***
## incm_pc -1.085e-04 9.428e-05 -1.151 0.250312
## wpovr50 4.951e-01 1.773e-01 2.792 0.005374 **
## incm_mh 4.748e-04 1.252e-04 3.792 0.000162 ***
## derm_pk 9.326e-01 4.480e-01 2.082 0.037739 *
## pcp_pk 8.814e-02 3.724e-02 2.367 0.018198 *
## docs_pk -1.328e-02 1.273e-02 -1.043 0.297343
## wpvr100 -4.183e-01 2.221e-01 -1.883 0.060075 .
## anRange_temperature -2.532e-02 8.108e-02 -0.312 0.754941
## cancer_gov_UV_exposure -2.651e-04 8.685e-03 -0.031 0.975657
## mean_cloud 5.570e-01 2.289e-01 2.434 0.015192 *
## elevation -8.486e-04 3.953e-03 -0.215 0.830082
## mean_temperature 3.939e-02 7.200e-02 0.547 0.584541
## seasonality_cloud -6.875e-01 3.165e-01 -2.172 0.030188 *
## seasonality_temperature -2.923e-03 2.658e-03 -1.100 0.271805
## sun_exposure 1.227e-03 1.111e-02 0.110 0.912114
## UV_daily_dose 2.639e-01 5.909e-02 4.466 9.29e-06 ***
## UV_irradiance -4.971e+00 1.150e+00 -4.321 1.78e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 240.0592)
##
## Null deviance: 240859 on 726 degrees of freedom
## Residual deviance: 170202 on 709 degrees of freedom
## AIC: 6067.5
##
## Number of Fisher Scoring iterations: 2
envmed_glm_aic=a$aic
c=summary(a)
envmed_glm_r2=1-(c$deviance/c$null.deviance)
This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)
library(effects)
for(i in 1:17){
plot(predictorEffects(a)[i],cex=.1)
}
You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)
library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
## Overall Var
## 1 0.03052533 cancer_gov_UV_exposure
## 2 0.11041185 sun_exposure
## 3 0.21467593 elevation
## 4 0.31225242 anRange_temperature
## 5 0.54701394 mean_temperature
## 6 1.04291545 docs_pk
## 7 1.09977129 seasonality_temperature
## 8 1.15053259 incm_pc
## 9 1.88325468 wpvr100
## 10 2.08157945 derm_pk
## 11 2.17198227 seasonality_cloud
## 12 2.36705777 pcp_pk
## 13 2.43367340 mean_cloud
## 14 2.79230559 wpovr50
## 15 3.79171390 incm_mh
## 16 4.32096854 UV_irradiance
## 17 4.46560857 UV_daily_dose
b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b)
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))
ratio_envmed=sum(x$SEER_rate)/sum(x$predicted)
RMSE_envmed=RMSE(x$predicted,x$SEER_rate)
Some AIC principles:
Lower indicates a more parsimonious model, relative to a model fit with a higher AIC.
It is a relative measure of model parsimony, so it only has meaning if we compare the AIC for alternate hypotheses (= different models of the data).
Some RMSE principles:
RMSE (residual mean square error) represents the model prediction error, that is the average difference the observed outcome values and the predicted outcome values.
AIC=list(env_glm_aic,med_glm_aic,envmed_glm_aic)
r2=list(env_glm_r2,med_glm_r2,envmed_glm_r2)
ratio=list(ratio_env,ratio_med,ratio_envmed)
RMSE=list(RMSE_env,RMSE_med,RMSE_envmed)
names=list("Environment","Medical","Combined")
dat=data.frame(AIC=as.numeric(AIC),
r2=as.numeric(r2),
ratio=as.numeric(ratio),
RMSE=as.numeric(RMSE),
model=as.character(names),
stringsAsFactors = F)
dat
## AIC r2 ratio RMSE model
## 1 6220.599 0.1107697 1 17.16412 Environment
## 2 6123.742 0.2152378 1 16.12440 Medical
## 3 6067.514 0.2933552 1 15.30083 Combined
I am not sure if this is the best way to do this, but I’m trying to match your interest of only allowing model responses to curve once. Normally I allow models to wiggle a lot more than this. Essentially these are the exact same models as above, but they can follow a quadratic function: http://dl.uncw.edu/digilib/Mathematics/Algebra/mat111hb/PandR/quadratic/quadratic.html
How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)
library(mgcv)
a=glm(SEER_rate~I(anRange_temperature^2)+I(cancer_gov_UV_exposure^2)+I(mean_cloud^2)+I(elevation^2)+I(mean_temperature^2)+I(seasonality_cloud^2)+I(seasonality_temperature^2)+I(sun_exposure^2)+I(UV_daily_dose^2)+I(UV_irradiance^2),data=modDF)
summary(a)
##
## Call:
## glm(formula = SEER_rate ~ I(anRange_temperature^2) + I(cancer_gov_UV_exposure^2) +
## I(mean_cloud^2) + I(elevation^2) + I(mean_temperature^2) +
## I(seasonality_cloud^2) + I(seasonality_temperature^2) + I(sun_exposure^2) +
## I(UV_daily_dose^2) + I(UV_irradiance^2), data = modDF)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -49.520 -10.542 -2.127 8.555 77.322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.526e+01 1.613e+01 5.287 1.65e-07 ***
## I(anRange_temperature^2) -1.740e-04 1.499e-04 -1.161 0.24604
## I(cancer_gov_UV_exposure^2) 1.482e-06 9.949e-07 1.489 0.13688
## I(mean_cloud^2) -7.084e-03 1.518e-03 -4.668 3.64e-06 ***
## I(elevation^2) 1.328e-06 1.388e-06 0.956 0.33932
## I(mean_temperature^2) -7.184e-04 2.426e-04 -2.961 0.00317 **
## I(seasonality_cloud^2) -2.364e-02 1.419e-02 -1.665 0.09631 .
## I(seasonality_temperature^2) -1.476e-07 2.075e-07 -0.712 0.47698
## I(sun_exposure^2) -7.393e-06 3.119e-06 -2.370 0.01803 *
## I(UV_daily_dose^2) -1.009e-05 5.695e-06 -1.772 0.07689 .
## I(UV_irradiance^2) 1.477e-03 1.071e-03 1.379 0.16831
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 292.6268)
##
## Null deviance: 240859 on 726 degrees of freedom
## Residual deviance: 209521 on 716 degrees of freedom
## AIC: 6204.6
##
## Number of Fisher Scoring iterations: 2
env_glm_aic=a$aic
c=summary(a)
env_glm_r2=1-(c$deviance/c$null.deviance)
This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)
library(effects)
for(i in 1:10){
plot(predictorEffects(a)[i],cex=.1)
}
You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)
library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
## Overall Var
## 1 0.7115420 I(seasonality_temperature^2)
## 2 0.9561441 I(elevation^2)
## 3 1.1609668 I(anRange_temperature^2)
## 4 1.3790514 I(UV_irradiance^2)
## 5 1.4891887 I(cancer_gov_UV_exposure^2)
## 6 1.6652203 I(seasonality_cloud^2)
## 7 1.7715736 I(UV_daily_dose^2)
## 8 2.3704288 I(sun_exposure^2)
## 9 2.9611259 I(mean_temperature^2)
## 10 4.6676791 I(mean_cloud^2)
b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b)
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))
ratio_env=sum(x$SEER_rate)/sum(x$predicted)
RMSE_env=RMSE(x$predicted,x$SEER_rate)
How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)
library(mgcv)
a=glm(SEER_rate~I(incm_pc^2)+I(wpovr50^2)+I(incm_mh^2)+I(derm_pk^2)+I(pcp_pk^2)+I(docs_pk^2)+I(wpvr100^2),data=modDF)
summary(a)
##
## Call:
## glm(formula = SEER_rate ~ I(incm_pc^2) + I(wpovr50^2) + I(incm_mh^2) +
## I(derm_pk^2) + I(pcp_pk^2) + I(docs_pk^2) + I(wpvr100^2),
## data = modDF)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -45.888 -10.647 -1.455 8.413 80.374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.953e+01 2.366e+00 12.481 < 2e-16 ***
## I(incm_pc^2) -6.348e-10 7.467e-10 -0.850 0.3955
## I(wpovr50^2) 5.918e-03 1.375e-03 4.302 1.92e-05 ***
## I(incm_mh^2) 1.442e-09 1.032e-09 1.397 0.1630
## I(derm_pk^2) 5.821e-02 3.548e-02 1.641 0.1013
## I(pcp_pk^2) 5.734e-04 2.222e-04 2.581 0.0101 *
## I(docs_pk^2) -3.008e-05 1.600e-05 -1.880 0.0605 .
## I(wpvr100^2) -2.814e-03 3.162e-03 -0.890 0.3737
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 268.3403)
##
## Null deviance: 240859 on 726 degrees of freedom
## Residual deviance: 192937 on 719 degrees of freedom
## AIC: 6138.7
##
## Number of Fisher Scoring iterations: 2
med_glm_aic=a$aic
c=summary(a)
med_glm_r2=1-(c$deviance/c$null.deviance)
This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)
library(effects)
for(i in 1:7){
plot(predictorEffects(a)[i],cex=.1)
}
You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)
library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
## Overall Var
## 1 0.8501913 I(incm_pc^2)
## 2 0.8901693 I(wpvr100^2)
## 3 1.3965039 I(incm_mh^2)
## 4 1.6407766 I(derm_pk^2)
## 5 1.8801337 I(docs_pk^2)
## 6 2.5806500 I(pcp_pk^2)
## 7 4.3022989 I(wpovr50^2)
b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b)
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))
ratio_med=sum(x$SEER_rate)/sum(x$predicted)
RMSE_med=RMSE(x$predicted,x$SEER_rate)
How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)
library(mgcv)
a=glm(SEER_rate~I(incm_pc^2)+I(wpovr50^2)+I(incm_mh^2)+I(derm_pk^2)+I(pcp_pk^2)+I(docs_pk^2)+I(wpvr100^2)+I(anRange_temperature^2)+I(cancer_gov_UV_exposure^2)+I(mean_cloud^2)+I(elevation^2)+I(mean_temperature^2)+I(seasonality_cloud^2)+I(seasonality_temperature^2)+I(sun_exposure^2)+I(UV_daily_dose^2)+I(UV_irradiance^2),data=modDF)
summary(a)
##
## Call:
## glm(formula = SEER_rate ~ I(incm_pc^2) + I(wpovr50^2) + I(incm_mh^2) +
## I(derm_pk^2) + I(pcp_pk^2) + I(docs_pk^2) + I(wpvr100^2) +
## I(anRange_temperature^2) + I(cancer_gov_UV_exposure^2) +
## I(mean_cloud^2) + I(elevation^2) + I(mean_temperature^2) +
## I(seasonality_cloud^2) + I(seasonality_temperature^2) + I(sun_exposure^2) +
## I(UV_daily_dose^2) + I(UV_irradiance^2), data = modDF)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -54.317 -9.602 -0.842 8.188 75.334
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.081e+01 1.618e+01 0.668 0.504404
## I(incm_pc^2) 5.665e-10 7.596e-10 0.746 0.456074
## I(wpovr50^2) 9.161e-03 1.405e-03 6.520 1.34e-10 ***
## I(incm_mh^2) 2.124e-09 1.111e-09 1.911 0.056422 .
## I(derm_pk^2) 1.885e-02 3.431e-02 0.549 0.582968
## I(pcp_pk^2) 4.593e-04 2.140e-04 2.147 0.032150 *
## I(docs_pk^2) -9.458e-06 1.556e-05 -0.608 0.543589
## I(wpvr100^2) -1.401e-02 3.710e-03 -3.775 0.000173 ***
## I(anRange_temperature^2) 5.755e-05 1.385e-04 0.415 0.677938
## I(cancer_gov_UV_exposure^2) -5.054e-07 9.308e-07 -0.543 0.587338
## I(mean_cloud^2) -8.118e-04 1.528e-03 -0.531 0.595479
## I(elevation^2) -7.988e-07 1.270e-06 -0.629 0.529609
## I(mean_temperature^2) -9.185e-04 2.239e-04 -4.102 4.56e-05 ***
## I(seasonality_cloud^2) -1.149e-02 1.335e-02 -0.860 0.389923
## I(seasonality_temperature^2) -3.519e-07 1.909e-07 -1.843 0.065712 .
## I(sun_exposure^2) 8.655e-07 2.917e-06 0.297 0.766764
## I(UV_daily_dose^2) -2.972e-05 5.488e-06 -5.416 8.35e-08 ***
## I(UV_irradiance^2) 5.567e-03 1.044e-03 5.330 1.32e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 238.4376)
##
## Null deviance: 240859 on 726 degrees of freedom
## Residual deviance: 169052 on 709 degrees of freedom
## AIC: 6062.6
##
## Number of Fisher Scoring iterations: 2
envmed_glm_aic=a$aic
c=summary(a)
envmed_glm_r2=1-(c$deviance/c$null.deviance)
This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)
library(effects)
for(i in 1:17){
plot(predictorEffects(a)[i],cex=.1)
}
You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)
library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
## Overall Var
## 1 0.2967236 I(sun_exposure^2)
## 2 0.4154509 I(anRange_temperature^2)
## 3 0.5311536 I(mean_cloud^2)
## 4 0.5429457 I(cancer_gov_UV_exposure^2)
## 5 0.5493062 I(derm_pk^2)
## 6 0.6076885 I(docs_pk^2)
## 7 0.6289122 I(elevation^2)
## 8 0.7457355 I(incm_pc^2)
## 9 0.8602838 I(seasonality_cloud^2)
## 10 1.8432374 I(seasonality_temperature^2)
## 11 1.9108884 I(incm_mh^2)
## 12 2.1467680 I(pcp_pk^2)
## 13 3.7748888 I(wpvr100^2)
## 14 4.1024103 I(mean_temperature^2)
## 15 5.3301649 I(UV_irradiance^2)
## 16 5.4160841 I(UV_daily_dose^2)
## 17 6.5198669 I(wpovr50^2)
b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b)
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))
ratio_envmed=sum(x$SEER_rate)/sum(x$predicted)
RMSE_envmed=RMSE(x$predicted,x$SEER_rate)
Some AIC principles:
Lower indicates a more parsimonious model, relative to a model fit with a higher AIC.
It is a relative measure of model parsimony, so it only has meaning if we compare the AIC for alternate hypotheses (= different models of the data).
Some RMSE principles:
RMSE (residual mean square error) represents the model prediction error, that is the average difference the observed outcome values and the predicted outcome values.
AIC=list(env_glm_aic,med_glm_aic,envmed_glm_aic)
r2=list(env_glm_r2,med_glm_r2,envmed_glm_r2)
ratio=list(ratio_env,ratio_med,ratio_envmed)
RMSE=list(RMSE_env,RMSE_med,RMSE_envmed)
names=list("Environment","Medical","Combined")
dat=data.frame(AIC=as.numeric(AIC),
r2=as.numeric(r2),
ratio=as.numeric(ratio),
RMSE=as.numeric(RMSE),
model=as.character(names),
stringsAsFactors = F)
dat
## AIC r2 ratio RMSE model
## 1 6204.611 0.1301115 1 16.97643 Environment
## 2 6138.662 0.1989654 1 16.29072 Medical
## 3 6062.586 0.2981285 1 15.24907 Combined
These are going to use “shrinkage” to assess the optimal wiggliness of model fits. Wiggle are fit using thin plate regression splines. These models will also effectively remove predictors from the model if they are not contributing significantly.
How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)
library(mgcv)
a=gam(SEER_rate~s(anRange_temperature,bs="ts")+s(cancer_gov_UV_exposure,bs="ts")+s(mean_cloud,bs="ts")+s(elevation,bs="ts")+s(mean_temperature,bs="ts")+s(seasonality_cloud,bs="ts")+s(seasonality_temperature,bs="ts")+s(sun_exposure,bs="ts")+s(UV_daily_dose,bs="ts")+s(UV_irradiance,bs="ts"),data=modDF)
summary(a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## SEER_rate ~ s(anRange_temperature, bs = "ts") + s(cancer_gov_UV_exposure,
## bs = "ts") + s(mean_cloud, bs = "ts") + s(elevation, bs = "ts") +
## s(mean_temperature, bs = "ts") + s(seasonality_cloud, bs = "ts") +
## s(seasonality_temperature, bs = "ts") + s(sun_exposure, bs = "ts") +
## s(UV_daily_dose, bs = "ts") + s(UV_irradiance, bs = "ts")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.4469 0.5312 91.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(anRange_temperature) 7.306 9 3.662 2.40e-06 ***
## s(cancer_gov_UV_exposure) 1.835 9 0.118 0.522023
## s(mean_cloud) 8.425 9 3.761 1.02e-05 ***
## s(elevation) 1.248 9 1.162 0.000271 ***
## s(mean_temperature) 7.761 9 4.023 7.65e-07 ***
## s(seasonality_cloud) 5.303 9 1.891 0.001379 **
## s(seasonality_temperature) 7.624 9 3.449 6.66e-06 ***
## s(sun_exposure) 2.434 9 0.602 0.048073 *
## s(UV_daily_dose) 8.178 9 9.153 < 2e-16 ***
## s(UV_irradiance) 0.477 9 0.077 0.194512
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.382 Deviance explained = 42.5%
## GCV = 220.79 Scale est. = 205.12 n = 727
env_glm_aic=AIC(a)
env_glm_r2=1-(a$deviance/a$null.deviance)
This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)
plot(a,pages=1)
You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)
library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
## Overall Var
## 1 0.2823104 cancer_gov_UV_exposure
## 2 0.7110526 UV_irradiance
## 3 1.3180956 sun_exposure
## 4 2.8604232 seasonality_cloud
## 5 3.5667163 elevation
## 6 4.9928597 mean_cloud
## 7 5.1765637 seasonality_temperature
## 8 5.6192172 anRange_temperature
## 9 6.1163033 mean_temperature
## 10 19.4286349 UV_daily_dose
b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b)
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))
ratio_env=sum(x$SEER_rate)/sum(x$predicted)
RMSE_env=RMSE(x$predicted,x$SEER_rate)
How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)
library(mgcv)
a=gam(SEER_rate~s(incm_pc,bs="ts")+s(wpovr50,bs="ts")+s(incm_mh,bs="ts")+s(derm_pk,bs="ts")+s(pcp_pk,bs="ts")+s(docs_pk,bs="ts")+s(wpvr100,bs="ts"),data=modDF)
summary(a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## SEER_rate ~ s(incm_pc, bs = "ts") + s(wpovr50, bs = "ts") + s(incm_mh,
## bs = "ts") + s(derm_pk, bs = "ts") + s(pcp_pk, bs = "ts") +
## s(docs_pk, bs = "ts") + s(wpvr100, bs = "ts")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.4469 0.5836 83.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(incm_pc) 4.9313 9 2.980 8.01e-06 ***
## s(wpovr50) 4.2884 9 1.386 0.003095 **
## s(incm_mh) 1.6059 9 0.912 0.005037 **
## s(derm_pk) 0.9919 9 1.119 0.000837 ***
## s(pcp_pk) 4.9516 9 2.660 4.60e-05 ***
## s(docs_pk) 0.7975 9 0.366 0.038217 *
## s(wpvr100) 3.4613 9 0.876 0.026058 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.254 Deviance explained = 27.5%
## GCV = 255.35 Scale est. = 247.62 n = 727
med_glm_aic=AIC(a)
med_glm_r2=1-(a$deviance/a$null.deviance)
This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)
plot(a,pages=1)
You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)
library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
## Overall Var
## 1 1.417746 docs_pk
## 2 1.584052 wpvr100
## 3 2.297864 incm_mh
## 4 2.509334 wpovr50
## 5 3.077054 derm_pk
## 6 4.337546 pcp_pk
## 7 5.096382 incm_pc
b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b)
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))
ratio_med=sum(x$SEER_rate)/sum(x$predicted)
RMSE_med=RMSE(x$predicted,x$SEER_rate)
How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)
library(mgcv)
a=gam(SEER_rate~s(anRange_temperature,bs="ts")+s(cancer_gov_UV_exposure,bs="ts")+s(mean_cloud,bs="ts")+s(elevation,bs="ts")+s(mean_temperature,bs="ts")+s(seasonality_cloud,bs="ts")+s(seasonality_temperature,bs="ts")+s(sun_exposure,bs="ts")+s(UV_daily_dose,bs="ts")+s(UV_irradiance,bs="ts")+s(incm_pc,bs="ts")+s(wpovr50,bs="ts")+s(incm_mh,bs="ts")+s(derm_pk,bs="ts")+s(pcp_pk,bs="ts")+s(docs_pk,bs="ts")+s(wpvr100,bs="ts"),data=modDF)
summary(a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## SEER_rate ~ s(anRange_temperature, bs = "ts") + s(cancer_gov_UV_exposure,
## bs = "ts") + s(mean_cloud, bs = "ts") + s(elevation, bs = "ts") +
## s(mean_temperature, bs = "ts") + s(seasonality_cloud, bs = "ts") +
## s(seasonality_temperature, bs = "ts") + s(sun_exposure, bs = "ts") +
## s(UV_daily_dose, bs = "ts") + s(UV_irradiance, bs = "ts") +
## s(incm_pc, bs = "ts") + s(wpovr50, bs = "ts") + s(incm_mh,
## bs = "ts") + s(derm_pk, bs = "ts") + s(pcp_pk, bs = "ts") +
## s(docs_pk, bs = "ts") + s(wpvr100, bs = "ts")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.4469 0.4568 106.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(anRange_temperature) 7.747e+00 9 5.195 1.67e-08 ***
## s(cancer_gov_UV_exposure) 4.836e+00 9 1.071 0.041169 *
## s(mean_cloud) 8.170e+00 9 4.495 5.70e-07 ***
## s(elevation) 1.405e+00 9 2.355 1.05e-06 ***
## s(mean_temperature) 8.571e+00 9 7.190 1.29e-11 ***
## s(seasonality_cloud) 5.334e+00 9 1.527 0.008824 **
## s(seasonality_temperature) 7.027e+00 9 3.783 3.33e-06 ***
## s(sun_exposure) 1.116e-06 9 0.000 0.508896
## s(UV_daily_dose) 8.295e+00 9 3.725 1.54e-05 ***
## s(UV_irradiance) 8.658e+00 9 3.052 0.000238 ***
## s(incm_pc) 5.956e-01 9 0.164 0.107798
## s(wpovr50) 1.539e-07 9 0.000 0.975020
## s(incm_mh) 7.612e-01 9 0.573 0.007799 **
## s(derm_pk) 3.447e-05 9 0.000 0.427150
## s(pcp_pk) 7.518e+00 9 3.615 2.24e-05 ***
## s(docs_pk) 2.646e-05 9 0.000 0.453685
## s(wpvr100) 7.199e+00 9 3.981 1.48e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.543 Deviance explained = 59.1%
## GCV = 169.72 Scale est. = 151.72 n = 727
envmed_glm_aic=AIC(a)
envmed_glm_r2=1-(a$deviance/a$null.deviance)
This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)
plot(a,pages=1)
You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)
library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
## Overall Var
## 1 0.01098665 wpovr50
## 2 0.29337106 sun_exposure
## 3 0.34324602 docs_pk
## 4 0.36941947 derm_pk
## 5 0.96738759 incm_pc
## 6 1.38542445 cancer_gov_UV_exposure
## 7 2.05433312 seasonality_cloud
## 8 2.10795161 incm_mh
## 9 3.62282430 UV_irradiance
## 10 4.64940077 pcp_pk
## 11 4.81234109 UV_daily_dose
## 12 5.47767891 seasonality_temperature
## 13 5.82842867 wpvr100
## 14 5.97778796 elevation
## 15 6.24429079 mean_cloud
## 16 7.77761027 anRange_temperature
## 17 10.88884578 mean_temperature
b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b)
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))
ratio_envmed=sum(x$SEER_rate)/sum(x$predicted)
RMSE_envmed=RMSE(x$predicted,x$SEER_rate)
Some AIC principles:
Lower indicates a more parsimonious model, relative to a model fit with a higher AIC.
It is a relative measure of model parsimony, so it only has meaning if we compare the AIC for alternate hypotheses (= different models of the data).
Some RMSE principles:
RMSE (residual mean square error) represents the model prediction error, that is the average difference the observed outcome values and the predicted outcome values.
AIC=list(env_glm_aic,med_glm_aic,envmed_glm_aic)
r2=list(env_glm_r2,med_glm_r2,envmed_glm_r2)
ratio=list(ratio_env,ratio_med,ratio_envmed)
RMSE=list(RMSE_env,RMSE_med,RMSE_envmed)
names=list("Environment","Medical","Combined")
dat=data.frame(AIC=as.numeric(AIC),
r2=as.numeric(r2),
ratio=as.numeric(ratio),
RMSE=as.numeric(RMSE),
model=as.character(names),
stringsAsFactors = F)
dat
## AIC r2 ratio RMSE model
## 1 5985.071 0.4247986 1 13.80462 Environment
## 2 6093.957 0.2752547 1 15.49556 Medical
## 3 5788.859 0.5906384 1 11.64577 Combined